Main theme: AI and Society
Hands-On: Introduction to Sentiment Analysis with R
Audience: High school students new to R
Duration: 2 hours
Environment: R and RStudio
Theme: AI and Society
KT Wong
kwanto@hku.hk
Faculty of Social Sciences, HKU
2025-07-31
Hands-On: Introduction to Sentiment Analysis with R
Audience: High school students new to R
Duration: 2 hours
Environment: R and RStudio
This 2-hour workshop introduces high school students to sentiment analysis using R in RStudio.
We will analyze social media comments about AI’s societal impact.
Roadmap
Learning Goals:
Objective: set up RStudio
Task 1.1: Open RStudio
File > New File > R Script
sentiment_workshop.R if needed
Task 1.2: Install Packages
Objective: Load R packages and a dataset of comments
Dataset
library(tidyverse)
library(tidytext)
library(textdata)
# Workshop dataset: 30 short comments about AI, one row per comment.
#   id   - comment number, 1 to 30
#   text - the raw comment text
# The trailing annotations (# Sarcasm, # Mixed, # Neutral, # Negative) label
# the tone of selected comments.
comments <- tibble(
  id = 1:30,
  text = c(
    "AI is amazing and will make education so much better!",
    "I’m worried AI will take over jobs and leave people unemployed.",
    "AI helps doctors save lives, it’s a game-changer.",
    "I don’t trust AI, it feels creepy and invasive.",
    "AI is okay, but it needs regulation to be safe.",
    "AI in schools is cool, but it’s not perfect.",
    "Wow, AI is so great, it’ll solve all our problems… yeah, right!", # Sarcasm
    "AI makes healthcare faster and more accurate, love it!",
    "Why does AI know so much about me? It’s unsettling.",
    "AI chatbots are fun to talk to, but sometimes useless.",
    "AI in movies is awesome, makes everything so realistic!",
    "I’m scared AI will control everything one day.",
    "AI helps me study better, it’s like a personal tutor.",
    "AI is overhyped, it’s not as smart as people think.", # Mixed
    "Using AI for art is creative and inspiring!",
    "AI in cars? No way, I don’t trust self-driving tech.",
    "AI makes my phone so smart, it’s incredible!",
    "I feel like AI is watching me all the time, creepy.",
    "AI in gaming makes battles so epic, I’m hooked!",
    "AI might replace teachers, and that’s not cool.",
    "AI saves time at work, but I miss human interaction.",
    "AI’s fine, but it makes mistakes sometimes.", # Neutral
    "AI in music creation is a total game-changer!",
    "I’m skeptical about AI making fair decisions.",
    "AI is great, but only if it’s used ethically.", # Mixed
    "AI makes life easier, but it’s a bit scary too.", # Mixed
    "AI in agriculture boosts crops, amazing stuff!",
    "I don’t get why everyone loves AI so much.", # Negative
    "AI tutors are helpful, but they don’t replace real teachers.",
    "AI sounds cool, but I’m not sure it’s safe." # Mixed
  )
)
print(comments)
view(comments) in the console
Objective: Learn tokenization to break text into words
Tokenization splits sentences into words
# Exploration snippets for the tokenized `words` table, one statement per line.
print(words)
nrow(words)
head(words, 5)
n_distinct(words$word)
# Ten most frequent words
words %>% count(word, sort = TRUE) %>% head(10)
# Rows where the token is exactly "better"
words %>% filter(word == "better")
Objective: Understand how lexicons assign sentiment scores
A lexicon is a dictionary scoring words’ emotions
Here, AI uses lexicons to quantify feelings in text
Objective: Assign sentiment scores to words
Match dataset words to AFINN lexicon scores
# Exploration snippets for `sentiment_scores`, one statement per line.
print(sentiment_scores)
# Words with a negative score
sentiment_scores %>% filter(value < 0)
# Words with a positive score
sentiment_scores %>% filter(value > 0)
Objective: Calculate total sentiment for each comment
Sum word scores per comment to get its overall sentiment
# Exploration snippets for `comment_sentiment`, one statement per line.
print(comment_sentiment)
# Comments ordered from highest to lowest total score
comment_sentiment %>% arrange(desc(total_score))
# Comments whose word scores sum to exactly zero
comment_sentiment %>% filter(total_score == 0)
Objective: Visualize sentiment and discuss AI’s societal impact
Create a bar plot to see positive/negative sentiments
# Bar chart of each comment's total score (y) by comment id (x), filled by
# its sentiment label. Each bar is annotated with its score; comments whose
# total_score is NA get an "NA" marker on the zero line instead.
ggplot(comment_sentiment, aes(x = id, y = total_score, fill = sentiment)) +
  # geom_col() is the direct form of geom_bar(stat = "identity")
  geom_col(na.rm = TRUE) +
  # Score labels: above the bar for scores >= 0, below it for negative scores
  geom_text(
    data = filter(comment_sentiment, !is.na(total_score)),
    aes(
      label = total_score,
      vjust = case_when(
        total_score >= 0 ~ -0.3,
        total_score < 0 ~ 1.3
      )
    )
  ) +
  # Marker for comments that have no score at all
  geom_text(
    data = filter(comment_sentiment, is.na(total_score)),
    aes(y = 0, label = "NA"),
    vjust = -0.3,
    color = "black",
    size = 2
  ) +
  labs(
    title = "Sentiment Scores of Comments about AI using AFINN",
    x = "Comment ID",
    y = "Sentiment Score"
  ) +
  scale_fill_manual(
    name = "Sentiment",
    values = c("Negative" = "red", "Positive" = "blue"),
    labels = c("Negative", "Positive"),
    na.translate = FALSE # don't show NA in the legend
  ) +
  scale_x_continuous(breaks = seq(2, 30, by = 2)) +
  theme_minimal()
geom_density() to visualize the distribution
geom_bar() to visualize counts of each sentiment label
Objective: Explore Bing and NRC sentiment lexicons as alternatives to AFINN
# Bing: count positive and negative words per comment, then label the comment
# by the difference (positive - negative).
comment_sentiment_bing <- words_bing %>%
  group_by(id, sentiment) %>%
  summarise(word_count = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = sentiment,
    values_from = word_count,
    values_fill = 0
  ) %>%
  # right_join keeps every comment; a comment with no lexicon words has NA
  # counts, so its total_score and sentiment are NA too
  right_join(comments, by = "id") %>%
  mutate(
    total_score = positive - negative,
    sentiment = case_when(
      is.na(total_score) ~ NA_character_,
      total_score > 0 ~ "Positive",
      total_score < 0 ~ "Negative",
      TRUE ~ "Neutral"
    )
  ) %>%
  arrange(id)

# NRC: keep only the positive/negative categories, then score exactly as for
# Bing.
words_nrc_pn <- words_nrc %>%
  filter(sentiment %in% c("positive", "negative"))

comment_sentiment_nrc <- words_nrc_pn %>%
  group_by(id, sentiment) %>%
  summarise(word_count = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = sentiment,
    values_from = word_count,
    values_fill = 0
  ) %>%
  right_join(comments, by = "id") %>%
  mutate(
    total_score = positive - negative,
    sentiment = case_when(
      is.na(total_score) ~ NA_character_,
      total_score > 0 ~ "Positive",
      total_score < 0 ~ "Negative",
      TRUE ~ "Neutral"
    )
  ) %>%
  arrange(id)

comment_sentiment_bing %>% filter(total_score > 0)
comment_sentiment_nrc %>% filter(total_score > 0)
geom_bar() to show counts of positive and negative words
left_join() to merge AFINN and Bing results by comment ID

# One row per comment with its Bing label and its AFINN label side by side
comparison_df2 <- comments %>%
  left_join(comment_sentiment_bing %>% select(id, sentiment), by = "id") %>%
  rename(sentiment_bing = sentiment) %>%
  left_join(comment_sentiment %>% select(id, sentiment), by = "id") %>%
  rename(sentiment_afinn = sentiment)
comparison_df2

# show the comments where Bing and AFINN disagree
# (the second condition catches rows where exactly one of the labels is NA,
# which `!=` alone would turn into NA)
comparison_df2 %>%
  filter(
    sentiment_bing != sentiment_afinn |
      is.na(sentiment_bing) != is.na(sentiment_afinn)
  )
geom_bar() to show counts of each sentiment per comment
# Reshape the data to long format for plotting
# Long format: one row per (comment, lexicon) pair with that lexicon's label
comparison_long <- comparison_df2 %>%
  select(id, sentiment_afinn, sentiment_bing) %>%
  pivot_longer(
    cols = c(sentiment_afinn, sentiment_bing),
    names_to = "lexicon",
    values_to = "sentiment"
  ) %>%
  mutate(lexicon = recode(lexicon,
    sentiment_afinn = "AFINN",
    sentiment_bing = "Bing"
  ))

# Create a grouped bar plot to compare sentiment distributions
ggplot(comparison_long, aes(x = sentiment, fill = lexicon)) +
  geom_bar(position = "dodge", alpha = 0.5) +
  # Text has no width of its own, so the dodge width must be given explicitly
  # and must equal the bars' default width (0.9); otherwise the count labels
  # are not centred over their bars.
  geom_text(
    stat = "count",
    aes(label = after_stat(count), group = lexicon),
    position = position_dodge(width = 0.9),
    vjust = -0.5
  ) +
  labs(
    title = "Comparison of Sentiment Labels: AFINN vs Bing",
    x = "Sentiment",
    y = "Count",
    fill = "Lexicon"
  ) +
  scale_fill_manual(values = c("AFINN" = "blue", "Bing" = "red")) +
  theme_minimal()
Objective: Use Ollama with Llama 3.2:3b to perform sentiment analysis
Ollama runs large language models (LLMs) like Llama 3.2:3b locally
# Try the LLM classifier on a single comment
get_sentiment_ollama("AI is amazing and will make education so much better!")

library(tidyr)

# Long format: one row per (comment, method) pair with that method's label
comparison3 <- comment_sentiment3 %>%
  select(id, sentiment_afinn, sentiment_bing, sentiment_ollama) %>%
  pivot_longer(
    cols = c(sentiment_afinn, sentiment_bing, sentiment_ollama),
    names_to = "method",
    values_to = "sentiment"
  )

# Number of comments per method and sentiment label
comparison_counts3 <- comparison3 %>%
  count(method, sentiment)

# Grouped bars: one bar per method within each sentiment label.
# Colours and legend labels are named by method so that each one is tied to
# its method explicitly rather than by alphabetical position.
ggplot(comparison_counts3, aes(x = sentiment, y = n, fill = method)) +
  geom_col(position = "dodge") +
  labs(
    title = "Sentiment Distribution Comparison",
    x = "Sentiment",
    y = "Count"
  ) +
  scale_fill_manual(
    name = "Method",
    values = c(
      sentiment_afinn = "#1f77b4",
      sentiment_bing = "#ff7f0e",
      sentiment_ollama = "#2ca02c"
    ),
    labels = c(
      sentiment_afinn = "AFINN (Lexicon)",
      sentiment_bing = "Bing (Lexicon)",
      sentiment_ollama = "llama3.2:3b (LLM)"
    )
  ) +
  scale_y_continuous(
    breaks = seq(0, max(comparison_counts3$n, na.rm = TRUE) + 3, by = 3)
  ) +
  theme_minimal()
sentiment_lexicon != sentiment_ollama
comment_sentiment %>% filter(id %in% c(7, 14, 26))
Does score match the text’s tone?
How does sentiment analysis help understand AI’s societal impact?
Companies: Improve AI based on feedback
Governments
Society: Highlight excitement for AI in education or healthcare
Wrap-up Questions:
Sentiment analysis is an AI tool to understand emotions in text
You’ve learned R to
Kaggle - Search “Twitter sentiment” for practice datasets
Analyze Social Media Data with Lexicon and LLM Methods
Objective: Practice everything learned using a real-world dataset
Step 1: Download and Load Data
# Fetch the three lexicons with get_sentiments()
afinn <- get_sentiments("afinn")
bing <- get_sentiments("bing")
nrc <- get_sentiments("nrc")

# AFINN: add up the `value` of every matched word within each id
real_sentiment_afinn <- real_words %>%
  inner_join(afinn, by = "word") %>%
  group_by(id) %>%
  summarise(total_score = sum(value, na.rm = TRUE))

# Bing: number of matched words per id and label, spread to one column per
# label (missing combinations filled with 0).
# NOTE(review): `sentiment.y` only exists after the join if `real_words`
# already carries its own `sentiment` column; otherwise the joined column is
# plain `sentiment` -- confirm against how `real_words` is built.
real_sentiment_bing <- real_words %>%
  inner_join(bing, by = "word") %>%
  count(id, sentiment.y, name = "word_count") %>%
  pivot_wider(
    names_from = sentiment.y,
    values_from = word_count,
    values_fill = 0
  )

# NRC: same counting, restricted to the positive/negative categories
real_sentiment_nrc <- real_words %>%
  inner_join(
    nrc %>% filter(sentiment %in% c("positive", "negative")),
    by = "word"
  ) %>%
  count(id, sentiment.y, name = "word_count") %>%
  pivot_wider(
    names_from = sentiment.y,
    values_from = word_count,
    values_fill = 0
  )
# AFINN
library(ggplot2)

# AFINN total score per id as a single-colour bar chart
ggplot(real_sentiment_afinn, aes(x = id, y = total_score)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "AFINN Sentiment Scores",
    x = "Comment ID",
    y = "Score"
  ) +
  theme_minimal()

# Bing
# Positive word counts are drawn upward and negative word counts downward
# (negated) from the zero line.
# NOTE(review): assumes `real_sentiment_bing` has both a `positive` and a
# `negative` column -- confirm.
ggplot(real_sentiment_bing, aes(x = id)) +
  geom_col(aes(y = positive), fill = "blue", alpha = 0.5) +
  geom_col(aes(y = -negative), fill = "red", alpha = 0.5) +
  labs(
    title = "Bing Lexicon: Positive vs Negative",
    x = "Comment ID",
    y = "Word Count"
  ) +
  theme_minimal()
# If you have Ollama and Llama3 installed:
library(ollamar)
# Classify the sentiment of one piece of text with the llama3.2:3b model
# through generate().
#
# text: the text to classify; it is appended to the instruction prompt.
#
# Returns the model's reply with surrounding whitespace removed and converted
# to lower case, so that replies such as "Positive\n" and "positive" compare
# equal.
get_sentiment_ollama <- function(text) {
  prompt <- paste("Classify the sentiment of the following text as positive, negative, or neutral, and respond with only the label in lower case:", text)
  response <- generate(model = "llama3.2:3b", prompt = prompt, output = "text")
  # The prompt asks for a bare lower-case label, but the reply is free text;
  # normalise it rather than trusting the model to comply exactly.
  tolower(trimws(response))
}
# Label every comment with the LLM: get_sentiment_ollama() is called once per
# row of `text`, and map_chr() collects the replies into a character column.
real_comments <- real_comments %>%
  mutate(sentiment_ollama = map_chr(text, get_sentiment_ollama))
Step 6: Compare and Discuss
Compare lexicon and LLM results
Which method best handles sarcasm, mixed emotions, or context?
Write a short paragraph (3–5 sentences) on your findings